Load In Part VII Data

years <- 2009:2013
dat_7 <-  vector(mode = "list", length = length(years))
keep_cols_7 <- c("EIN", "TAXYR", "F9_07_COMP_DTK_TITLE", "first_name", "suffix", "status", "gender", "gender_confidence", "F9_07_COMP_DTK_COMP_ORG", "F9_07_COMP_DTK_EMPL_BEN", "F9_07_COMP_DTK_COMP_OTH")

for(i in 1:length(years)){
  link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/partvii/PARTVII-", years[i], "-PEOPLEPARSED.csv")
  temp <- fread(link, select = keep_cols_7) 
  colnames(temp) <- c("EIN", "year", "title", "first_name", "suffix", "status", "gender", "gender_confidence", "comp_main", "comp_benefit", "comp_other" )
  dat_7[[i]] <- temp
}

dat_all_7 <- rbindlist(dat_7)

dat_all_7 <-
  dat_all_7 %>% 
  filter(year <= max(years))

Load in Part I data

dat_1 <- vector(mode = "list", length = length(years))
keep_cols_1 <- c("ORG_EIN", "TAX_YEAR", "F9_01_ACT_GVRN_NUM_VOTE_MEMB", "F9_01_ACT_GVRN_NUM_VOTE_MEMB_IND")



for(i in 1:length(years)){
  link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/parsed/F9-P01-T00-SUMMARY-", years[i], ".csv")
  temp <- fread(link, select = keep_cols_1) 
  colnames(temp) <- c("EIN", "year", "num_vote_memb", "num_vote_memb_ind")
  dat_1[[i]] <- temp
}

dat_all_1 <- rbindlist(dat_1)

dat_all_1 <-
  dat_all_1 %>% 
  filter(year <= 2013)

Basic Statistics

Number of board members per year From Part VII

dat_all_7 %>% 
  group_by(EIN, year) %>%  ## Try grouping by URL
  summarise(num_members = n()) %>% 
  ungroup %>% 
  group_by(year) %>%
  summarise(avg_num_members = mean(num_members, na.rm=TRUE),
            sd_num_members = sd(num_members, na.rm= TRUE))
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
## # A tibble: 5 × 3
##    year avg_num_members sd_num_members
##   <int>           <dbl>          <dbl>
## 1  2009            14.5           15.2
## 2  2010            11.2           12.5
## 3  2011            11.0           13.7
## 4  2012            10.8           13.1
## 5  2013            10.6           11.7
dat_all_7 %>% 
  group_by(EIN, year) %>% 
  summarise(num_members = n()) %>% 
  ungroup %>%
  group_by(year) %>%
  ggplot(aes(x = num_members, y = after_stat(count),
             group = as.factor(year), color = as.factor(year) )) +
  geom_density(adjust = 2) +
  xlim(0, 50) +
  ggtitle("Number of Voting Members Listed on Part VII") +
  guides(color= guide_legend(title="Year"))+ 
  xlab("Number of Members") +
  ylab("Frequency")
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
## Warning: Removed 8916 rows containing non-finite values (`stat_density()`).

Number of board members per year From Part I

#~34% of "Number of Members" are NA
mean(is.na(dat_all_1$num_vote_memb))
## [1] 0.3406153
#~34% of "Number of Independent Members" are NA
mean(is.na(dat_all_1$num_vote_memb_ind))
## [1] 0.3406153
dat_all_1 %>% 
  group_by(year) %>%
  summarise(avg_num_members = mean(num_vote_memb, na.rm=TRUE),
            sd_num_members = sd(num_vote_memb, na.rm= TRUE),
            avg_num_members_indep = mean(num_vote_memb_ind, na.rm=TRUE),
            sd_num_members_indep = sd(num_vote_memb_ind, na.rm= TRUE))
## # A tibble: 5 × 5
##    year avg_num_members sd_num_members avg_num_members_indep
##   <int>           <dbl>          <dbl>                 <dbl>
## 1  2009            17.7           79.2                  17.2
## 2  2010            21.4          318.                   20.1
## 3  2011            20.5          294.                   19.2
## 4  2012            20.7          403.                   18.6
## 5  2013            19.4          250.                   19.9
## # ℹ 1 more variable: sd_num_members_indep <dbl>

These numbers do not agree with the PART VII summary statistics.

dat_all_1 %>% 
  na.omit() %>%
  pivot_longer(!c(EIN, year), names_to = "type", values_to = "num") %>% 
  ggplot(aes(x = num, y = ..count..,
             color = as.factor(type))) +
  geom_density(adjust = 2) +
  facet_wrap(~year) +
  xlim(0, 50) +
  ggtitle("Number of Voting Members Listed on Part I") +
  guides(color=guide_legend(title="Type")) +
  xlab("Number of Members") +
  ylab("Frequency")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 29105 rows containing non-finite values (`stat_density()`).

dat_all_1 %>% 
  na.omit() %>% 
  filter(num_vote_memb < 100, 
         num_vote_memb_ind < 100) %>%
  ggplot(aes(x = num_vote_memb, y = num_vote_memb_ind )) +
  geom_point(alpha = 0.05) +
  facet_wrap(~year) +
  ggtitle("Voting Members vs. Independent Voting Members") +
  xlab("Number of Voting Members") +
  ylab("Number of Independent Voting Members")

Gender Percentages

dat_all_7 %>% 
  group_by(EIN, year, gender) %>% 
  summarise(num_members = n() ) %>%
  ungroup %>%
  group_by(year, gender) %>%
  summarise(avg_num_members = mean(num_members, na.rm = TRUE)) %>% 
  arrange(desc(gender)) %>%
  ggplot(aes(x = year, y = avg_num_members, fill = fct_inorder(gender))) +
  geom_col(position="stack") +
  
  ggtitle("Average Board Members By Gender") +
  xlab("Year") +
  ylab("Average Number of Members")  +
  guides(fill=guide_legend(title="Gender"))
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

dat_all_7 %>% 
  group_by(EIN, year, gender) %>% 
  summarise(num_members = n() ) %>%
  ungroup %>%
  group_by(year, gender) %>%
  filter(gender != "U") %>% 
  summarise(avg_num_members = mean(num_members, na.rm = TRUE)) %>% 
  arrange(desc(gender)) %>%
  mutate(avg_num_members = round(avg_num_members, 2)) %>%
  ggplot(aes(x = as.factor(year), y = avg_num_members, fill = fct_inorder(gender))) +
  geom_col(position="fill") +
  #geom_text(aes(label = avg_num_members), position = position_fill(vjust = 0.5)) +
  ggtitle("Percentage of Board Members By Gender") +
  xlab("Year") +
  ylab("Percentage of Members") +
  guides(fill=guide_legend(title="Gender")) 
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

dat_all_7 %>% 
  group_by(EIN, year, gender) %>% 
  summarise(num_members = n() ) %>%
  ungroup %>%
  group_by(year, gender) %>%
  filter(gender != "U") %>% 
  summarise(avg_num_members = mean(num_members, na.rm = TRUE)) %>% 
  arrange(desc(gender)) %>%
  mutate(avg_num_members = round(avg_num_members, 2)) %>%
  ggplot(aes(x = as.factor(year), y = avg_num_members, fill = fct_inorder(gender))) +
  geom_col(position="fill") +
  #geom_text(aes(label = avg_num_members), position = position_fill(vjust = 0.5)) +
  ggtitle("Percentage of Board Members By Gender") +
  xlab("Year") +
  ylab("Percentage of Members") +
  guides(fill=guide_legend(title="Gender")) 
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Change in gender percentage make up of the boards

dat_all_7 %>% 
  select(EIN, year, gender) %>% 
  group_by(EIN, year, gender) %>% 
  summarise(num_members = n() ) %>%
  ungroup %>% 
  pivot_wider(names_from = gender, values_from = num_members, values_fill = 0) %>% 
  rowwise() %>% 
  mutate(total = sum(M, U, `F`)) %>% 
  mutate(pM = M / total,
         pU = U / total,
         pF = `F` / total) %>% 
  group_by(year) %>% 
  summarise(M = mean(pM),
            U = mean(pU),
            `F` = mean(pF))  %>% 
  pivot_longer(!year, names_to = "gender", values_to = "avg_perc") %>% 
  mutate(avg_perc = round(avg_perc, 2)) %>%
  ggplot(aes(x = as.factor(year), y = avg_perc, fill = fct_inorder(gender))) +
  geom_col(position="fill") +
  geom_text(aes(label = avg_perc), position = position_fill(vjust = 0.5)) +
  ggtitle("Average Gener Percentage of Board Members") +
  xlab("Year") +
  ylab("Percentage of Members") +
  guides(fill=guide_legend(title="Gender")) 
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.

dat_all_7 %>% 
  select(EIN, year, gender) %>% 
  group_by(EIN, year, gender) %>% 
  summarise(num_members = n() ) %>%
  ungroup %>% 
  pivot_wider(names_from = gender, values_from = num_members, values_fill = 0) %>% 
  rowwise() %>% 
  mutate(total = sum(M, `F`)) %>% 
  mutate(pM = M / total,
         pF = `F` / total) %>% 
  group_by(year) %>% 
  summarise(M = mean(pM, na.rm = T),
            `F` = mean(pF, na.rm = T))  %>% 
  pivot_longer(!year, names_to = "gender", values_to = "avg_perc") %>% 
  mutate(avg_perc = round(avg_perc, 2)) %>%
  ggplot(aes(x = as.factor(year), y = avg_perc, fill = fct_inorder(gender))) +
  geom_col(position="fill") +
  geom_text(aes(label = avg_perc), position = position_fill(vjust = 0.5)) +
  ggtitle("Average Gener Percentage of Board Members") +
  xlab("Year") +
  ylab("Percentage of Members") +
  guides(fill=guide_legend(title="Gender")) 
## `summarise()` has grouped output by 'EIN', 'year'. You can override using the
## `.groups` argument.

##with in org gender change 
dat_all_7 %>% 
  filter(year %in% c(2010, 2013)) %>% 
  filter(gender != "U") %>%
  select(EIN, year, gender) %>% 
  group_by(EIN, year) %>% 
  summarise(mm = mean(gender == "M"))  %>% 
  ungroup() %>% 
  pivot_wider(names_from = year, values_from = mm) %>% 
  na.omit() %>%
  mutate(change = `2013` - `2010` ) %>% 
  pull(change) %>% 
  summary()
## `summarise()` has grouped output by 'EIN'. You can override using the `.groups`
## argument.
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.000000 -0.062500  0.000000 -0.005956  0.045455  1.000000

Exploring some of the “U” genders + suffix

Conclusion: Suffix will not be helpful for “U”. All the suffix’s are JR, II, III, etc. They are not gendered.

#first look at all of the people with "U" to see if they have a suffix
dat_all_7 %>% 
  filter(gender == "U") %>% 
  filter(nchar(suffix) > 0) %>% 
  pull(suffix) %>% 
  table()
## .
##     II    III     IV     JR SENIOR     SR 
##    528   2537    282   4831      3    501

Looking at Orgs with more independent members than members

Find such orgs

dat.more <- 
  dat_all_1 %>% 
  filter(num_vote_memb_ind > num_vote_memb)

dat.more %>% 
  ggplot(aes(x = num_vote_memb , y = num_vote_memb_ind)) + 
  geom_point() +
  xlab("Number of Members") +
  ylab("Number of Independent Members") +
  ggtitle("Orgs with more independent members than members")

dat.more %>% 
  filter(num_vote_memb < 100) %>%
  filter(num_vote_memb_ind < 500) %>% 
  ggplot(aes(x = num_vote_memb , y = num_vote_memb_ind)) + 
  geom_point() +
  xlab("Number of Members") +
  ylab("Number of Independent Members") +
  ggtitle("Orgs with more independent members than members")

Let’s investiage a few of them

set.seed(100)
dat.more %>% 
  filter(num_vote_memb < 20) %>%
  filter(num_vote_memb_ind < 100) %>% 
  sample_n(1)
##          EIN year num_vote_memb num_vote_memb_ind
## 1: 201736709 2010             6                27
dat.more %>% 
  filter(num_vote_memb > 100) %>%
  filter(num_vote_memb_ind > 200) %>% 
  sample_n(1)
##          EIN year num_vote_memb num_vote_memb_ind
## 1: 596583560 2012           397               486
dat.more %>% 
  filter(num_vote_memb < 100) %>%
  filter(num_vote_memb_ind > 200) %>% 
  sample_n(1)
##          EIN year num_vote_memb num_vote_memb_ind
## 1: 710224574 2012            13               323